W.E.L. Pena, P.R. De Massaguer, A.D.G. Zuniga, and S.H. Saraiva (2011).
"Modeling the Growth Limit of Alicyclobacillus Acidoterrestris CRA7152
in Apple Juice: Effect of pH, Brix, Temperature, and Nisin Concentration,"
Journal of Food Processing and Preservation, Vol. 35, pp. 509-517.
Presence/Absence of growth of CRA7152 in apple juice
as a function of pH (3.5-5.5), Brix (11-19), temperature (25-50C),
and Nisin concentration (0-70)
Read paper here
pH 5-8
Nisin concentration (13-16)
Temperature 22-24
Brix Concentration 30-32
Growth 40 / 1=Yes, 0=No /
In [117]:
library(MASS)
library(ROCR)
In [118]:
data <- read.table("apple_juice.dat",
sep="",
col.names=c("pH", "NisinConcentration", "Temperature",
"BrixConcentration", "Growth"))
In [119]:
head(data)
In [120]:
summary(data)
nrow(data)
print(nrow(subset(data, Growth == 1)))
print(nrow(subset(data, Growth == 0)))
jpeg("label_dist.jpg", width = 750, height = 750)
barplot(table(data$Growth), main="Response variable distribution",
xlab="Growth")
dev.off()
In [121]:
jpeg("var_dist.jpg", width = 750, height = 750)
par(mfrow=c(2,2))
for(i in 1:4) {
hist(data[,i], main = names(data)[i], col="gray", xlab = names(data)[i])
}
dev.off()
In [122]:
jpeg("var_box.jpg", width = 750, height = 750)
par(mfrow=c(2,2))
for(i in 1:4) {
boxplot(data[,i], main=names(data)[i])
}
dev.off()
In [123]:
data$Growth = as.factor(data$Growth)
data[, 1:4] = scale(data[, 1:4])
head(data)
In [124]:
library(Amelia)
library(mlbench)
missmap(data, col=c("blue", "red"), legend=FALSE)
In [125]:
library(corrplot)
jpeg("corr.jpg", width = 750, height = 750)
correlations <- cor(data[,1:4])
corrplot.mixed(correlations, lower.col = "black")
#corrplot(correlations, method="circle")
dev.off()
In [126]:
jpeg("scatter.jpg", width = 750, height = 750)
pairs(data, col=data$Growth)
dev.off()
In [127]:
jpeg("density.jpg", width = 750, height = 750)
library(caret)
x <- data[,1:4]
y <- data[,5]
scales <- list(x=list(relation="free"), y=list(relation="free"))
featurePlot(x=x, y=y, plot="density", scales=scales)
dev.off()
In [128]:
par(mfrow=c(2,2))
cdplot(Growth ~ pH, data = data)
cdplot(Growth ~ NisinConcentration, data = data)
cdplot(Growth ~ Temperature, data = data)
cdplot(Growth ~ BrixConcentration, data = data)
In [137]:
subsample_negative = function(data) {
neg = subset(data, Growth == 0)
pos = subset(data, Growth == 1)
neg_n = nrow(neg)
pos_n = nrow(pos)
if (neg_n > pos_n) {
neg_sample = neg[sample(nrow(neg), pos_n),]
return(rbind(neg_sample, pos))
} else {
pos_sample = neg[sample(nrow(pos), neg_n),]
return(rbind(neg, pos_sample))
}
}
leave_one_out_sample = function(formula) {
x <- c()
y <- c()
for (row in 1:nrow(data)) {
test <- data[row, 1:5]
if (row > 1) {
if (row == nrow(data)) {
train <- data[1:(row-1), 1:5]
} else {
train <- rbind(data[1:(row-1), 1:5], data[(row+1):nrow(data), 1:5])
}
} else {
train <- data[(row + 1):nrow(data), 1:5]
}
stopifnot(nrow(train) == 73)
train <- subsample_negative(train)
stopifnot(nrow(train) == 50 || nrow(train) == 52)
model.now <- glm(formula, data = train, family = binomial())
fitted.results <- predict(model.now, newdata = test, type='response')
y <- append(y, fitted.results)
fitted.results <- ifelse(fitted.results > 0.5, 1, 0)
#misClasificError <- mean(fitted.results != test$Growth)
x <- append(x, fitted.results)
}
stopifnot(length(x) == 74)
print(mean(x == data$Growth))
precision <- posPredValue(as.factor(x), data$Growth, positive="1")
print(precision)
recall <- sensitivity(as.factor(x), data$Growth, positive="1")
print(recall)
F1 <- (2 * precision * recall) / (precision + recall)
print(F1)
pr <- prediction(y, data$Growth)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
jpeg("roc_better_model.jpg", width = 750, height = 750)
plot(prf)
dev.off()
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
}
leave_one_out_sample(Growth ~ pH + NisinConcentration + Temperature + BrixConcentration)
#model.glm.1 <- glm(Growth ~ pH + NisinConcentration + Temperature + BrixConcentration,
# data = data, family = binomial())
#model.glm.1
#coef(model.glm.1)
In [140]:
leave_one_out_sample(Growth ~ NisinConcentration + pH + Temperature +
BrixConcentration + pH:BrixConcentration + NisinConcentration:BrixConcentration +
Temperature:BrixConcentration + NisinConcentration:Temperature)
In [139]:
summary(model.glm.1)
In [103]:
jpeg("simple_model.jpg", width = 750, height = 750)
layout(matrix(1:4,ncol=2))
plot(model.glm.1)
dev.off()
In [104]:
# Make predictions
probabilities <- predict(model.glm.1, data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
# Model accuracy
mean(predicted.classes==data$Growth)
In [105]:
owidth={0.9\columnwidth}
In [106]:
model.glm.2 <- glm(Growth ~ 1, data = data, family = binomial())
# Make predictions
probabilities <- predict(model.glm.2, data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
# Model accuracy
mean(predicted.classes==data$Growth)
stepAIC(model.glm.2, scope= ~ pH * NisinConcentration * Temperature * BrixConcentration)
In [107]:
model.best <- glm(formula = Growth ~ NisinConcentration + pH + Temperature +
BrixConcentration + pH:BrixConcentration + NisinConcentration:BrixConcentration +
Temperature:BrixConcentration + NisinConcentration:Temperature,
family = binomial(), data = data)
model.best
In [108]:
summary(model.best)
In [109]:
coef(model.best)
In [110]:
layout(matrix(1:4,ncol=2))
plot(model.best)
In [111]:
# Make predictions
probabilities <- predict(model.best, data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
# Model accuracy
mean(predicted.classes==data$Growth)
In [112]:
model.all <- glm(formula = Growth ~.,
family = binomial(), data = data)
model.all
In [113]:
model.all$coef
In [86]:
layout(matrix(1:4,ncol=2))
plot(model.all)
In [87]:
# Make predictions
probabilities <- predict(model.all, data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
# Model accuracy
mean(predicted.classes==data$Growth)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: